{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a3556561",
   "metadata": {},
   "source": [
    "| [02_lexical_analysis/01_中文分词工具.ipynb](https://github.com/shibing624/nlp-tutorial/tree/main/02_lexical_analysis/01_中文分词工具.ipynb)  | 中文分词工具  |[Open In Colab](https://colab.research.google.com/github/shibing624/nlp-tutorial/blob/main/02_lexical_analysis/01_中文分词工具.ipynb) |\n",
    "\n",
    "\n",
    "# 中文分词工具\n",
    "中文分词（切词）工具有很多，常用的包括 jieba、pkuseg、fastHan、HanLP 等。\n",
    "\n",
    "这些工具都可以用 pip 安装，例如：\n",
    "\n",
    "```shell\n",
    "pip install jieba\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "ac956ce5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['一个', '傻子', '在', '北京', ',', ' ', '哈尔滨工业大学', '迎来', '100', '年', '华诞', '，', '周华', '主持', '了', '会议', '。', '乐视', '赞助', '了', '会议']\n"
     ]
    }
   ],
   "source": [
    "import jieba\n",
    "\n",
    "# Sample sentence; the `text` name is read again by the POS-tagging and pkuseg cells.\n",
    "text = \"一个傻子在北京, 哈尔滨工业大学迎来100年华诞，周华主持了会议。乐视赞助了会议\"\n",
    "\n",
    "# lcut gives the segmentation as a list of word strings.\n",
    "jieba_tokens = jieba.lcut(text)\n",
    "print(jieba_tokens)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fd1a76b5",
   "metadata": {},
   "source": [
    "词性标注结果："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "063548a5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[pair('一个', 'm'), pair('傻子', 'n'), pair('在', 'p'), pair('北京', 'ns'), pair(',', 'x'), pair(' ', 'x'), pair('哈尔滨工业大学', 'nt'), pair('迎来', 'v'), pair('100', 'm'), pair('年', 'm'), pair('华诞', 'a'), pair('，', 'x'), pair('周华', 'nr'), pair('主持', 'v'), pair('了', 'ul'), pair('会议', 'n'), pair('。', 'x'), pair('乐视', 'n'), pair('赞助', 'v'), pair('了', 'ul'), pair('会议', 'n')]\n"
     ]
    }
   ],
   "source": [
    "import jieba.posseg\n",
    "\n",
    "# Segment and POS-tag `text` (the sample sentence from the jieba cell);\n",
    "# each item prints as pair(word, flag).\n",
    "tagged_words = jieba.posseg.lcut(text)\n",
    "print(tagged_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2ee2ef74",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pkuseg\n",
    "\n",
    "# Do not rebind `text`: cut() takes the raw sentence (a str), and both segmenters\n",
    "# below need the same input. Results are stored under their own names so this cell\n",
    "# can be re-run and other cells that read `text` keep seeing the sentence.\n",
    "seg = pkuseg.pkuseg()  # load the model with the default configuration\n",
    "pkuseg_tokens = seg.cut(text)  # word segmentation only\n",
    "print(pkuseg_tokens)\n",
    "\n",
    "pos_seg = pkuseg.pkuseg(postag=True)  # load the model with POS tagging enabled\n",
    "pkuseg_tagged = pos_seg.cut(text)  # word segmentation plus POS tagging\n",
    "print(pkuseg_tagged)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cb9a30de",
   "metadata": {},
   "outputs": [],
   "source": [
    "from fastHan import FastHan\n",
    "\n",
    "model = FastHan()\n",
    "\n",
    "# Single sentence: run the \"Parsing\" target, then the \"NER\" target, printing each result.\n",
    "single_sentence = \"郭靖是金庸笔下的一名男主。\"\n",
    "for target in (\"Parsing\", \"NER\"):\n",
    "    print(model(single_sentence, target=target))\n",
    "\n",
    "# Same input segmented twice: before and after switching the CWS style to 'cnc'.\n",
    "# Note the style change stays in effect on `model` for the calls that follow.\n",
    "cws_sentence = \"一个苹果。\"\n",
    "print(model(cws_sentence, 'CWS'))\n",
    "model.set_cws_style('cnc')\n",
    "print(model(cws_sentence, 'CWS'))\n",
    "\n",
    "# Batch input: pass a list of sentences, then walk the result sentence by sentence\n",
    "# and token by token, printing each token with its pos, head and head_label.\n",
    "batch_sentences = [\"我爱踢足球。\", \"林丹是冠军\"]\n",
    "batch_parses = model(batch_sentences, 'Parsing')\n",
    "for index, parsed_sentence in enumerate(batch_parses):\n",
    "    print(index)\n",
    "    for token in parsed_sentence:\n",
    "        print(token, token.pos, token.head, token.head_label)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0d563123",
   "metadata": {},
   "source": [
    "本节完。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "95aa9598",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}